{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "947c906f-136a-4f3c-9345-c57f0ed34124",
   "metadata": {},
   "source": [
    "# 划分训练集和测试集\n",
    "\n",
    "张子豪 2023-2-22"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "11c7d96b-19a4-4236-bf1f-2977f2dcba99",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import random\n",
    "\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "23aeda88-1d70-40ca-b895-6f57a84dda58",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset root folder as a relative path; it is resolved against the current working directory when used.\n",
    "Dataset_root = 'SJB_25_Dataset'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ab791fbe-21dc-405f-9072-b157e9047686",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Enter the labelme annotation folder; the following cells use paths relative to it.\n",
    "labelme_jsons_dir = os.path.join(Dataset_root, 'labelme_jsons')\n",
    "os.chdir(labelme_jsons_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f112267b-1044-45aa-9ad2-f2752a4b80e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "共有 25 个 labelme 格式的 json 文件\n"
     ]
    }
   ],
   "source": [
    "# Count the entries in the current working directory and report the total.\n",
    "entry_count = len(os.listdir())\n",
    "print('共有 {} 个 labelme 格式的 json 文件'.format(entry_count))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b04360b9-c43b-4a01-a093-8908be5ea0a8",
   "metadata": {},
   "source": [
    "## 删除系统自动生成的多余文件"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f049f6ef-d54d-4d1d-83e7-a8d23bcb85a0",
   "metadata": {},
   "source": [
    "### 查看待删除的多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "23c17bcc-bcbc-4ed1-a829-1b63e99d2b80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview: list every path named '__MACOSX' (case-insensitive) below the current directory.\n",
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "69da9d50-11d1-4071-b818-dbb065a6474e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview: list every path named '.DS_Store' (case-insensitive) below the current directory.\n",
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d4d6ca60-2948-4105-b635-e0d390814067",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview: list every path named '.ipynb_checkpoints' (case-insensitive) below the current directory.\n",
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fedea526-5aac-42a6-b058-61b4ae4afe81",
   "metadata": {},
   "source": [
    "### 删除多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8d959813-5703-4e01-ba3e-f27b207d59f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every '__MACOSX' entry below the current directory.\n",
    "# The NUL-delimited hand-off (-print0 / xargs -0) keeps paths containing whitespace intact, and\n",
    "# avoiding a `$i` shell variable prevents IPython from substituting a Python variable named i.\n",
    "# -prune stops find from descending into a match that is about to be removed.\n",
    "!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d2ea0bef-6954-44d5-8478-680c8d7eded1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every '.DS_Store' entry below the current directory.\n",
    "# The NUL-delimited hand-off (-print0 / xargs -0) keeps paths containing whitespace intact, and\n",
    "# avoiding a `$i` shell variable prevents IPython from substituting a Python variable named i.\n",
    "# -prune stops find from descending into a match that is about to be removed.\n",
    "!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7a2b568c-2685-4a13-90f6-185f60066360",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every '.ipynb_checkpoints' entry below the current directory.\n",
    "# The NUL-delimited hand-off (-print0 / xargs -0) keeps paths containing whitespace intact, and\n",
    "# avoiding a `$i` shell variable prevents IPython from substituting a Python variable named i.\n",
    "# -prune stops find from descending into a match that is about to be removed.\n",
    "!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c0f41291-369b-4dd2-8265-8b54a980f6ea",
   "metadata": {},
   "source": [
    "### 验证多余文件已删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2dea6d10-e56f-4775-a055-933d09a9adde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify: prints nothing when no '__MACOSX' entry is left below the current directory.\n",
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "23d47236-e846-42bd-9856-a0dd07a23621",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify: prints nothing when no '.DS_Store' entry is left below the current directory.\n",
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a85571be-399b-4e6e-b00e-f3fea415a17e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify: prints nothing when no '.ipynb_checkpoints' entry is left below the current directory.\n",
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78005455-4d7b-4c61-8197-0440c6d72069",
   "metadata": {},
   "source": [
    "## 划分训练集、测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "9559d312-7353-47e8-8207-84f31a229b9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_frac = 0.2  # fraction of the files assigned to the test set\n",
    "random.seed(123) # fixed random seed so the shuffle can be reproduced"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "02992d76-09cf-43ac-8d39-03f18b04a12a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory whose entries get split; '.' is the current working directory.\n",
    "folder = '.'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9edb977a-5a88-45ff-923d-0865c8b1421c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据集文件总数 25\n",
      "训练集文件个数 20\n",
      "测试集文件个数 5\n"
     ]
    }
   ],
   "source": [
    "# Collect the annotation files in a deterministic order: os.listdir() returns entries in\n",
    "# arbitrary order, so without sorting the seeded shuffle would not be reproducible across machines.\n",
    "# Only *.json files are kept, so re-running this cell after the split folders exist\n",
    "# does not shuffle the folders themselves into the file lists.\n",
    "img_paths = sorted(name for name in os.listdir(folder) if name.lower().endswith('.json'))\n",
    "random.shuffle(img_paths) # in-place shuffle driven by the module-level random state\n",
    "\n",
    "val_number = int(len(img_paths) * test_frac) # number of test-set files (rounded down)\n",
    "train_files = img_paths[val_number:]         # training-set file names\n",
    "val_files = img_paths[:val_number]           # test-set file names\n",
    "\n",
    "print('数据集文件总数', len(img_paths))\n",
    "print('训练集文件个数', len(train_files))\n",
    "print('测试集文件个数', len(val_files))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53ab77c4-51e4-4236-9003-a2b4f26fca46",
   "metadata": {},
   "source": [
    "## 将训练集json文件移动到`train_labelme_jsons`文件夹"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b44776b2-3b69-4451-a59b-115eaea97395",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the folder that holds the training-set labelme JSON annotation files.\n",
    "# exist_ok=True keeps the cell re-runnable when the folder is already there.\n",
    "train_labelme_jsons_folder = 'train_labelme_jsons'\n",
    "os.makedirs(train_labelme_jsons_folder, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "66bdba3f-4bfb-4865-a427-e330fefec387",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:00<00:00, 1832.81it/s]\n"
     ]
    }
   ],
   "source": [
    "# Move each training annotation from the working folder into the training sub-folder.\n",
    "for json_name in tqdm(train_files):\n",
    "    shutil.move(\n",
    "        os.path.join(folder, json_name),\n",
    "        os.path.join(train_labelme_jsons_folder, json_name),\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "375ab554-433a-42a9-b242-fb7f81283db2",
   "metadata": {},
   "source": [
    "## 将测试集json文件移动到`val_labelme_jsons`文件夹"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "d643ef3e-27b2-4648-a0e1-d5b0136314b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the folder that holds the test-set labelme JSON annotation files.\n",
    "# exist_ok=True keeps the cell re-runnable when the folder is already there.\n",
    "val_labelme_jsons_folder = 'val_labelme_jsons'\n",
    "os.makedirs(val_labelme_jsons_folder, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "080bbc39-b982-4dd5-bead-d2c1901b7edf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 1476.14it/s]\n"
     ]
    }
   ],
   "source": [
    "# Move each test annotation from the working folder into the test sub-folder.\n",
    "for json_name in tqdm(val_files):\n",
    "    shutil.move(\n",
    "        os.path.join(folder, json_name),\n",
    "        os.path.join(val_labelme_jsons_folder, json_name),\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "13a71eb1-386d-4948-99a5-9fe4b97fbfb4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: total number of entries across both split folders.\n",
    "sum(len(os.listdir(split_dir)) for split_dir in (train_labelme_jsons_folder, val_labelme_jsons_folder))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "7ed6edd8-6a70-4a61-9908-e6859b552b06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Go back up two directory levels from the current working directory.\n",
    "os.chdir(os.path.join(os.pardir, os.pardir))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecd6fcf0-64f5-4477-85dd-4b38757ee8e2",
   "metadata": {},
   "source": [
    "## 删除系统自动生成的多余文件"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "741a463d-c631-4e26-bc71-9ee23314338e",
   "metadata": {},
   "source": [
    "### 查看待删除的多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6ece8a4a-6812-43fe-97cf-6320c6b29702",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview: list every path named '__MACOSX' (case-insensitive) below the current directory.\n",
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "734d66a4-8440-487d-8bc7-6a1c5ed2ce9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview: list every path named '.DS_Store' (case-insensitive) below the current directory.\n",
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "60aa36a8-7804-4194-a47b-43013f2404ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./【C】Labelme转COCO-单个文件/.ipynb_checkpoints\n",
      "./.ipynb_checkpoints\n"
     ]
    }
   ],
   "source": [
    "# Preview: list every path named '.ipynb_checkpoints' (case-insensitive) below the current directory.\n",
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e94bdc01-6f4e-4909-a875-48e2f3765877",
   "metadata": {},
   "source": [
    "### 删除多余文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "580a96df-baf5-4c80-8efc-5f9cffa08c84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every '__MACOSX' entry below the current directory.\n",
    "# The NUL-delimited hand-off (-print0 / xargs -0) keeps paths containing whitespace intact, and\n",
    "# avoiding a `$i` shell variable prevents IPython from substituting a Python variable named i.\n",
    "# -prune stops find from descending into a match that is about to be removed.\n",
    "!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "65c8376b-50fb-46ab-9d8b-d9718556bb5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every '.DS_Store' entry below the current directory.\n",
    "# The NUL-delimited hand-off (-print0 / xargs -0) keeps paths containing whitespace intact, and\n",
    "# avoiding a `$i` shell variable prevents IPython from substituting a Python variable named i.\n",
    "# -prune stops find from descending into a match that is about to be removed.\n",
    "!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "3cf53f24-fca0-42e9-aa40-878cf280115a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove every '.ipynb_checkpoints' entry below the current directory.\n",
    "# The NUL-delimited hand-off (-print0 / xargs -0) keeps paths containing whitespace intact, and\n",
    "# avoiding a `$i` shell variable prevents IPython from substituting a Python variable named i.\n",
    "# -prune stops find from descending into a match that is about to be removed.\n",
    "!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "944e7e80-0cd6-4b5d-a7fc-ae716a827287",
   "metadata": {},
   "source": [
    "### 验证多余文件已删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "440cb364-adf7-439b-98f5-3dedc48e1888",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify: prints nothing when no '__MACOSX' entry is left below the current directory.\n",
    "!find . -iname '__MACOSX'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "940e7fa1-ff61-452d-a900-2383c9a1b658",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify: prints nothing when no '.DS_Store' entry is left below the current directory.\n",
    "!find . -iname '.DS_Store'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "7168766c-75eb-4664-8219-6d35aa755125",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify: prints nothing when no '.ipynb_checkpoints' entry is left below the current directory.\n",
    "!find . -iname '.ipynb_checkpoints'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "301d181a-5cf8-4889-88da-dd06753762b5",
   "metadata": {},
   "source": [
    "## 查看数据集目录结构"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "2f43c7d1-f461-4b3a-a622-5d965eead40d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 SJB_25_Dataset/\n",
      "├─📁 images/\n",
      "│ ├─📄 DSC_0219.jpg\n",
      "│ ├─📄 DSC_0209.jpg\n",
      "│ ├─📄 DSC_0236.jpg\n",
      "│ ├─📄 DSC_0293.jpg\n",
      "│ ├─📄 DSC_0245.jpg\n",
      "│ ├─📄 DSC_0278.jpg\n",
      "│ ├─📄 MVIMG_20230331_080908.jpg\n",
      "│ ├─📄 MVIMG_20230331_080920.jpg\n",
      "│ ├─📄 DSC_0285.jpg\n",
      "│ ├─📄 DSC_0284.jpg\n",
      "│ ├─📄 DSC_0280.jpg\n",
      "│ ├─📄 DSC_0281.jpg\n",
      "│ ├─📄 DSC_0240.jpg\n",
      "│ ├─📄 DSC_0283.jpg\n",
      "│ ├─📄 DSC_0269.jpg\n",
      "│ ├─📄 DSC_0282.jpg\n",
      "│ ├─📄 DSC_0259.jpg\n",
      "│ ├─📄 MVIMG_20230331_080914.jpg\n",
      "│ ├─📄 MVIMG_20230331_080915.jpg\n",
      "│ ├─📄 DSC_0289.jpg\n",
      "│ ├─📄 DSC_0260.jpg\n",
      "│ ├─📄 DSC_0274.jpg\n",
      "│ ├─📄 DSC_0301.jpg\n",
      "│ ├─📄 DSC_0265.jpg\n",
      "│ └─📄 MVIMG_20230331_080912.jpg\n",
      "└─📁 labelme_jsons/\n",
      "  ├─📁 train_labelme_jsons/\n",
      "  │ ├─📄 DSC_0283.json\n",
      "  │ ├─📄 DSC_0293.json\n",
      "  │ ├─📄 DSC_0301.json\n",
      "  │ ├─📄 DSC_0284.json\n",
      "  │ ├─📄 MVIMG_20230331_080908.json\n",
      "  │ ├─📄 DSC_0259.json\n",
      "  │ ├─📄 DSC_0289.json\n",
      "  │ ├─📄 MVIMG_20230331_080914.json\n",
      "  │ ├─📄 DSC_0209.json\n",
      "  │ ├─📄 MVIMG_20230331_080912.json\n",
      "  │ ├─📄 DSC_0285.json\n",
      "  │ ├─📄 DSC_0278.json\n",
      "  │ ├─📄 DSC_0281.json\n",
      "  │ ├─📄 MVIMG_20230331_080915.json\n",
      "  │ ├─📄 DSC_0240.json\n",
      "  │ ├─📄 MVIMG_20230331_080920.json\n",
      "  │ ├─📄 DSC_0282.json\n",
      "  │ ├─📄 DSC_0236.json\n",
      "  │ ├─📄 DSC_0269.json\n",
      "  │ └─📄 DSC_0219.json\n",
      "  └─📁 val_labelme_jsons/\n",
      "    ├─📄 DSC_0245.json\n",
      "    ├─📄 DSC_0260.json\n",
      "    ├─📄 DSC_0280.json\n",
      "    ├─📄 DSC_0274.json\n",
      "    └─📄 DSC_0265.json\n"
     ]
    }
   ],
   "source": [
    "import seedir as sd\n",
    "# Render the tree of the dataset configured in Dataset_root instead of repeating the\n",
    "# folder name as a literal, so the view follows the configured dataset.\n",
    "sd.seedir(Dataset_root, style='emoji', depthlimit=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d159293-0ebd-4438-b378-df547b0464e7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
